In this notebook. only include the analysis of the whole data after filtering, and the integration part.

#REPO code for CF_seurat analysis
library(dplyr)
library(Seurat)
library(patchwork)
library(data.table)
library(R.utils)
library(ggplot2)
library(patchwork)
set.seed(412)

Read the data

#load gene matrix
mat<-Seurat::ReadMtx("./CF_data/GSE145360_Sputum.data.processed.mtx",cells="./CF_data/GSE145360_Sputum.processed.barcodes.tsv",
                     features="./CF_data/GSE145360_Sputum.processed.genes.tsv", feature.column = 1)
#load metadata
meta <- data.frame(fread("https://cells.ucsc.edu/human-sputum/meta.tsv"), row.names=1)

Data to Seurat object

# Initialize the Seurat object with the raw (non-normalized data).
#New seurat object for paper analysis
cf <- CreateSeuratObject(counts = mat, project = "cf", min.cells = 3, min.features = 200,
                           meta.data=meta)

QC

# 1. QC -------
Idents(cf) <- cf@meta.data$disease.ident
# get percent mitochondrial genes
cf[["percent.mt"]] <- PercentageFeatureSet(cf, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(cf, features = c("nFeature_RNA", "nCount_RNA", "percent.mt")
        #, ncol = 3
        ,group.by = NULL
        )
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

plot1 <- FeatureScatter(cf, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(cf, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")+geom_smooth(method = 'lm')
plot1 + plot2
## `geom_smooth()` using formula = 'y ~ x'

data filtering

# data filtering -----------------
cf.1 <- subset(cf, subset = nFeature_RNA>200 & nFeature_RNA < 7000 & percent.mt <5)

# split object
cf.1[["RNA"]] <- split(cf.1[["RNA"]], f = cf.1$disease.ident)

normalize data

# normalize data
cf.1 <- NormalizeData(cf.1)
## Normalizing layer: counts.CF
## Normalizing layer: counts.CTRL
# feature selection(find highly variable features)
cf.1 <- FindVariableFeatures(cf.1, selection.method = "vst", nfeatures = 3000)
## Finding variable features for layer counts.CF
## Finding variable features for layer counts.CTRL
# 
# ## Identify the 10 most highly variable genes
# top10 <- head(VariableFeatures(cf.1), 10)
# 
# ## plot variable features with and without labels
# plot1 <- VariableFeaturePlot(cf.1)
# plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
# plot1 + plot2

Data Scale

# scale to mean 0 and variance 1
all.genes <- rownames(cf.1)
cf.1<-ScaleData(cf.1, features=all.genes, verbose=F)
# regress out percentage number of mitochondrial genes
cf.1 <-ScaleData(cf.1, vars.to.regress = c("nFeature_RNA", "nCount_RNA","percent.mt"), do.scale=F, do.center=F, verbose=F)
## Warning: Different features in new layer data than already exists for
## scale.data

PCA dimensionaily reduction on subset of scaled variable features

#PCA dimensionaily reduction on subset of scaled variable features
cf.1 <- RunPCA(cf.1, features = VariableFeatures(object = cf.1))
## PC_ 1 
## Positive:  PLEK, ACSL1, DOCK4, CSF3R, FOS, TNFRSF1B, SLC2A3, IFITM2, G0S2, CHST11 
##     SAMSN1, FPR1, C1QA, AC006369.1, LUCAT1, APOC1, C1QB, MARCO, PRKCB, PFKFB3 
##     OLR1, SLC25A37, C1QC, FABP4, HLA-DPA1, HLA-DPB1, HLA-DQA1, PROK2, FABP5, KCNJ15 
## Negative:  SPRR3, KRT13, LCN2, ECM1, MAL, LYPD2, PSCA, TMPRSS11B, EMP1, KRT19 
##     SPRR2A, KRT78, SCEL, PRSS27, MUC4, PRSS22, DUSP5, ELF3, NDRG2, AC118754.1 
##     TMPRSS2, PARD3, MUC21, PITX1, PPL, FAM129B, SPINK5, FBXO32, C6orf132, TMPRSS11E 
## PC_ 2 
## Positive:  G0S2, BASP1, PTGS2, IFITM2, SLC25A37, RGS2, SAMSN1, CXCL8, PFKFB3, SLC2A3 
##     PRKCB, ZFP36, PROK2, LUCAT1, MXD1, CSF3R, ANTXR2, TNFRSF1B, ZFP36L1, MCTP2 
##     MARCKS, IER3, OSM, SIPA1L1, S100A12, PLEK, PLXNC1, FPR1, NFKBIA, IL1B 
## Negative:  FABP4, APOC1, C1QB, C1QA, APOE, MARCO, C1QC, SCD, NUPR1, FABP5 
##     LSAMP, PPARG, CD52, HLA-DQA1, CTSL, HLA-DPB1, HLA-DRA, FN1, CD74, ANO5 
##     TPRG1, HLA-DPA1, CCL18, HLA-DRB1, TXNIP, AL365295.1, RBP4, AC092691.1, ANXA1, LYZ 
## PC_ 3 
## Positive:  PROK2, MCTP2, ACSL1, ALPL, DYSF, FCGR3B, RUBCNL, S100A8, S100A9, KCNJ15 
##     PFKFB3, SORL1, IFRD1, SELL, CREB5, ADGRG3, CMTM2, AL049651.1, FABP4, CXCR2 
##     OSM, SLC25A37, IL1RAP, FPR1, LSAMP, PTGS2, AC105402.3, AL034397.3, S100A12, FPR2 
## Negative:  C15orf48, CCL3L1, FNIP2, CCL3, CCL4L2, CCL4, VEGFA, JARID2, SERPINB9, HIF1A-AS2 
##     IER3, CXCR4, GBE1, HES4, SLAMF7, AC025580.2, TLR2, CXCL2, TNFAIP3, TIMP1 
##     TRAF1, DOCK4, PID1, ICAM1, EREG, GPR183, KYNU, NFKBIA, AZIN1-AS1, VCAN 
## PC_ 4 
## Positive:  LYZ, IL1B, TIMP1, OLR1, IL1RN, S100A8, S100A9, ANXA1, PTGS2, S100A12 
##     HLA-DRA, CTSL, AC020656.1, CCL3, ACSL1, KYNU, RPLP1, FCN1, HSPA1A, SLC2A3 
##     HSP90AA1, VCAN, CD74, EHD1, RPS12, SOCS3, HLA-DRB1, EREG, CXCL2, IER3 
## Negative:  ZEB1, AC006369.1, KATNBL1, PRKCB, CXCR4, PLAU, AC099489.1, RASSF2, PLPP3, CD22 
##     GBE1, CHST11, EPHB1, INPP5A, HIF1A-AS2, RIPOR2, SLC38A1, GLT1D1, SULF2, PPP1R16B 
##     KIAA0319, RHOH, CARD11, BANK1, SNAPC1, SPTBN1, IL1R1, RNF103-CHMP3, PLXNC1, ST6GAL1 
## PC_ 5 
## Positive:  AC090498.1, GPR183, RPS12, HLA-DRA, SERPINB9, CD74, RPLP1, HLA-DRB1, RFTN1, MEF2C 
##     HLA-DPB1, HLA-DPA1, ST8SIA4, TCF4, BCL2, ARID5B, HSP90AA1, ADAM19, RUNX3, PPP1R16B 
##     SAMSN1, PDE4D, MTSS1, SIPA1L1, XYLT1, HLA-DQA1, HSPA1A, SLC2A3, ST6GAL1, ENTPD1 
## Negative:  CCL3L1, CCL4L2, AC006369.1, PLAU, CXCL8, CCL4, CCL3, HCAR3, KATNBL1, MAFF 
##     LUCAT1, FOS, GBE1, CXCL2, NFKBIA, IER3, PLPP3, AC005050.3, HCAR2, IER2 
##     AC099489.1, TLR2, DOCK4, TNFAIP3, AC023157.3, MXD1, TNFAIP6, ZFP36, LSAMP, PI3
#visualize reduced dimensions
VizDimLoadings(cf.1, dims = 1:10, reduction = "pca")

#dimension heat map(first dimension- primary sources for heterogen.)
#cells and features ordered by PCA scores
#DimHeatmap(cf.1, dims = 1:20, cells = 500, balanced = TRUE)

#Determine "dimensionality" of dataset (paper used 12)
ElbowPlot(cf.1, ndims=20, reduction="pca")

Perform streamlined (one-line) integrative analysis

We use rpca here after comparing our results with CCA and Harmony.

# Perform streamlined (one-line) integrative analysis--------------------
CF_integration.1 <- IntegrateLayers(
  object = cf.1, method = RPCAIntegration,
  orig.reduction = "pca", new.reduction = 'integrated.rpca',
  verbose = FALSE)


CF_integration.1 <- FindNeighbors(CF_integration.1, reduction = "integrated.rpca", dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
CF_integration.1 <- FindClusters(CF_integration.1, resolution =c(0.1,0.2,0.3,0.5), cluster.name = c("rpca_clusters_0.1","rpca_clusters_0.2","rpca_clusters_0.3","rpca_clusters_0.5"))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 20495
## Number of edges: 669914
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9648
## Number of communities: 6
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 20495
## Number of edges: 669914
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9418
## Number of communities: 7
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 20495
## Number of edges: 669914
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9252
## Number of communities: 9
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 20495
## Number of edges: 669914
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9018
## Number of communities: 14
## Elapsed time: 3 seconds
CF_integration.1 <- RunUMAP(CF_integration.1, reduction = "integrated.rpca", dims = 1:15, reduction.name = "umap.rpca")
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 15:18:53 UMAP embedding parameters a = 0.9922 b = 1.112
## 15:18:53 Read 20495 rows and found 15 numeric columns
## 15:18:53 Using Annoy for neighbor search, n_neighbors = 30
## 15:18:53 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 15:18:56 Writing NN index file to temp file /var/folders/0r/r2dplk4d72d437ygr7ycq8kw0000gn/T//RtmpX7AYON/file612c1e077b0b
## 15:18:56 Searching Annoy index using 1 thread, search_k = 3000
## 15:19:02 Annoy recall = 100%
## 15:19:04 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 15:19:05 Initializing from normalized Laplacian + noise (using RSpectra)
## 15:19:06 Commencing optimization for 200 epochs, with 851070 positive edges
## 15:19:17 Optimization finished
DimPlot(CF_integration.1, reduction = "umap.rpca"
        , group.by = c("rpca_clusters_0.1","rpca_clusters_0.2"
                       ,"rpca_clusters_0.3","rpca_clusters_0.5")
        ,label = T)

# 
# DimPlot(CF_integration.1, reduction = "umap.unintegrated"
#         , group.by = c("rpca_clusters_0.1","rpca_clusters_0.2"
#                        ,"rpca_clusters_0.3","rpca_clusters_0.5")
#         ,label = T)

findConserved markers

# findConserved markers (resolution=0.1)--------------------
CF_integration.1 <- JoinLayers(CF_integration.1)
Idents(CF_integration.1) <- CF_integration.1@meta.data$rpca_clusters_0.1

# cluster0
marker_r1_0 <- FindConservedMarkers(CF_integration.1, ident.1 = 0, grouping.var = "disease.ident", verbose = FALSE)
head(marker_r1_0,20)
#FCGR3B,CXCR2,ALPL -- PMN by paper
VlnPlot(CF_integration.1, features=c("CXCR4", "IGF2R","ALPL"))

VlnPlot(CF_integration.1, features=c("FCGR3B", "CXCR2","ALPL"))

# MRC1 or MARCO typically detected on alvMΦs 
VlnPlot(CF_integration.1, features=c("MRC1", "MARCO"))

After several searches, we name each cluster and plot the named clusters as below:

CF_integration.1 <- RenameIdents(CF_integration.1, `0` = 'PMN')
CF_integration.1 <- RenameIdents(CF_integration.1, `1` = 'AM2')
CF_integration.1 <- RenameIdents(CF_integration.1, `2` = 'M/Momo')# 
CF_integration.1 <- RenameIdents(CF_integration.1, `3` = 'epithelial')
CF_integration.1 <- RenameIdents(CF_integration.1, `4` = 'T/B')
CF_integration.1 <- RenameIdents(CF_integration.1, `5` = 'AM1')
# plot_clustering1 <- DimPlot(CF_integration.1,reduction = c("umap.unintegrated")
#         , label.size = 4,label = T)
plot_clustering2 <- DimPlot(CF_integration.1,reduction = c("umap.rpca")
          , label.size = 4,label = T)
# plot_clustering1|
plot_clustering2

DimPlot(CF_integration.1,reduction = c("umap.rpca")
          ,split.by = 'disease.ident',, label.size = 4,label = T)

findMarkers between CF vs. Control

# findMarkers between conditions ---------------------
CF_integration.1$celltype.cnd <- paste0(CF_integration.1$rpca_clusters_0.1,'_', CF_integration.1$disease.ident)
#View(CF_integration.1@meta.data)
Idents(CF_integration.1) <- CF_integration.1$celltype.cnd
DimPlot(CF_integration.1, reduction = 'umap.rpca', label = TRUE)

# find markers
b.interferon.response <- FindMarkers(CF_integration.1, ident.1 = '1_CF', ident.2 = '1_CTRL')

head(b.interferon.response)
# plotting conserved features vs DE features between conditions
#head(marker_r1_1)
FeaturePlot(CF_integration.1, features = c('G0S2', 'PRKCB', 'ANO5','AC026369.3'), split.by = 'disease.ident', min.cutoff = 'q10')